#### Read in data ####
# Read the DIME congressional elections file, taking column names from the
# CSV header row and keeping every column as character.
#
# The previous call passed `guess_max = 40,000`. R parses that as
# `guess_max = 40` followed by a stray positional `000`, which is matched to
# `col_names`. Depending on the readr version that either errors or causes
# the header row to be read as data (which is why a `row_to_names()` call was
# needed afterwards, and why every column came back as character).
# Reading the header directly with an explicit all-character column spec
# gives the same table without relying on that accident; numeric columns
# (`gpct`, `ppct`) are converted explicitly further down.
df <- read_csv("data/dime_cong_elections_current.csv",
               col_types = cols(.default = col_character()))

#### Select variables ####
# Keep only the columns used downstream. `distcyc` is renamed to `district`
# as part of the selection, so it keeps its original position.
df <- df %>%
  select(
    cycle, bonica_rid, bonica_rid_cycle,
    Name, party, state, seat,
    Incum_Chall, cand_gender,
    ppct, pwinner, gpct, gwinner,
    candStatus, num_prim_opps,
    district = distcyc,
    id, dcp, candidate_inactive
  )

#### Filtering ####
# Restrict to rows whose `seat` is a federal House or Senate seat; rows with
# a missing `seat` are dropped.
federal_seats <- c("federal:house", "federal:senate")
df <- filter(df, seat %in% federal_seats)
rm(federal_seats)

#### Clean names ####
# Split `Name` at the comma into `last` and `first` (keeping `Name` itself),
# then take the first space-delimited word of each half as the working
# last name (`lname`) and first name (`fname`).
df <- df %>%
  separate(Name, into = c("last", "first"), sep = ",", remove = FALSE) %>%
  mutate(first = str_trim(first, "left"),
         lname = word(last, 1),
         fname = word(first, 1))

df <- df %>%
  mutate(
    # Drop the first comma, then the first period, from each name
    fname = str_remove(str_remove(fname, ","), "\\."),
    lname = str_remove(str_remove(lname, ","), "\\."),
    # Last names: keep only what follows the final hyphen
    lname = gsub(".*-", "", lname),
    # First names: keep only what precedes the first hyphen
    fname = gsub("-.*", "", fname),
    # A one-character first name is replaced by the second word of `first`
    # (NA when `first` has no second word)
    fname = if_else(nchar(fname) == 1, word(first, 2), fname),
    # When the first word of the last name is just "DE" or "VAN", fall back
    # to the full `last` string so the particle stays attached to the name
    lname = if_else(lname %in% c("DE", "VAN"), last, lname)
  )

# Strip every non-alphanumeric character (spaces included), then title-case
df <- df %>%
  mutate(fname = str_to_title(str_replace_all(fname, "[^[:alnum:]]", "")),
         lname = str_to_title(str_replace_all(lname, "[^[:alnum:]]", "")))

#### Row-by-row name edits ####
# Hand-written corrections for surnames whose apostrophe appears as a space
# in the raw `last` field (e.g. "O DONNELL"). Keys are matched exactly
# against `last`, including case and doubled spaces; rows whose `last` is not
# listed (or is NA) keep their current `lname`.
#
# Fix: "D ANNUNZIO" previously mapped to the misspelt "D'annuzio".
apostrophe_fixes <- c(
  "O DONNELL"     = "O'donnell",
  "O  ROURKE"     = "O'rourke",
  "O ROURKE"      = "O'rourke",
  "O CONNOR"      = "O'connor",
  "O BRIEN CAROL" = "O'brien",
  "D SILVA"       = "D'silva",
  "O  NEILL"      = "O'neill",
  "O HARA"        = "O'hara",
  "O  DEAR"       = "O'dear",
  "L ITALIEN"     = "L'italien",
  "O CULL"        = "O'cull",
  "D ALESSANDRO"  = "D'alessandro",
  "O HALLERAN"    = "O'halleran",
  "D ANNUNZIO"    = "D'annunzio",
  "o NEILL"       = "O'neill",
  "o BRIANT"      = "O'bryant",
  "D ORAZIO"      = "D'orazio",
  "D AMBOISE"     = "D'amboise"
)

# Indexing the lookup by `last` yields NA for unlisted names, which
# coalesce() then fills from the existing `lname`.
df <- df %>%
  mutate(lname = coalesce(unname(apostrophe_fixes[last]), lname))

rm(apostrophe_fixes)

# Remove punctuation
# NOTE(review): this pass also strips the apostrophes inserted just above, so
# "O'donnell" ends up as "Odonnell". Behaviour kept as-is -- confirm that the
# downstream name lookups expect the apostrophe-free spelling.
df$lname <- str_replace_all(df$lname, "[^[:alnum:]]", "")
df$fname <- str_replace_all(df$fname, "[^[:alnum:]]", "")

#### First name commonality ####
# For each reference birth year, attach the count (`n`) and proportion
# (`prop`) of babies given the candidate's first name, matched on first name
# and sex, using the `babynames` table. Candidates with no match get 0.
#
# Fix: the 1930 section used to zero `fprop` / `fn` (the 1950 columns)
# wherever the 1930 values were missing, which overwrote valid 1950 data and
# left `fprop30` / `fn30` as NA. Each year now fills only its own columns.

# Join one year of `babynames` onto `data`.
#   data       table with `fname` and `cand_gender` columns
#   birth_year value of `year` to keep from `babynames`
#   n_col      name to give the joined count column
#   prop_col   name to give the joined proportion column
# Returns `data` with the two new columns appended (count first), NAs set to 0.
add_first_name_freq <- function(data, birth_year, n_col, prop_col) {
  yearly <- babynames %>%
    filter(year == birth_year) %>%
    rename(fname = name) %>%
    select(-year)
  names(yearly)[names(yearly) == "n"] <- n_col
  names(yearly)[names(yearly) == "prop"] <- prop_col

  joined <- left_join(data, yearly, by = c("fname", "cand_gender" = "sex"))
  joined[[n_col]][is.na(joined[[n_col]])] <- 0
  joined[[prop_col]][is.na(joined[[prop_col]])] <- 0
  joined
}

# Year order is kept as before so the output column order does not change.
df <- df %>%
  add_first_name_freq(1950, "fn", "fprop") %>%
  add_first_name_freq(1930, "fn30", "fprop30") %>%
  add_first_name_freq(1940, "fn40", "fprop40") %>%
  add_first_name_freq(1960, "fn60", "fprop60")

rm(add_first_name_freq)

#### Surname commonality ####
# Load the census surname frequency table and derive a title-cased `lname`
# key (via a lower-cased `last_clean`) so it can be matched to candidates.
# `cum_freq` and the raw `surname` are dropped; `percent_freq`, `rank` and
# `last_clean` are carried into `df` by the join.
census_surnames <- read.table(
  "data/census-names/dist.all.last.txt",
  col.names = c('surname', 'percent_freq', 'cum_freq', 'rank')
) %>%
  mutate(last_clean = tolower(surname),
         lname = str_to_title(last_clean)) %>%
  select(-c(cum_freq, surname))

# Surnames with no match get a frequency of 0 (`rank` is left as NA)
df <- df %>%
  left_join(census_surnames, by = 'lname') %>%
  mutate(percent_freq = replace(percent_freq, is.na(percent_freq), 0))

rm(census_surnames)

#### Syllable and character counts ####
# Run readability() over the cleaned last and first names and keep what is
# left after dropping the listed score/count columns, renaming `chars`,
# `sylls` and `cl` with an `l` (last name) or `f` (first name) prefix.
last_stats <- readability(df$lname) %>%
  select(-c(re, gl, ari, smog, nonwords, sents, polys, words, wordchars)) %>%
  rename(lchars = chars,
         lsylls = sylls,
         lcl = cl)

first_stats <- readability(df$fname) %>%
  select(-c(re, gl, ari, smog, nonwords, sents, polys, words, wordchars)) %>%
  rename(fchars = chars,
         fsylls = sylls,
         fcl = cl)

# Rows are matched by position: last-name columns first, then first-name
df <- bind_cols(df, last_stats, first_stats)
rm(last_stats, first_stats)




#### Imputing race ####
# Build the input table for predict_race(): one row per candidate row with a
# `surname` column and a numeric sex indicator (1 = "F", 0 = "M"), dropping
# any row with a missing value in the selected columns.
#
# Fixes in this section:
#  * `sex` used to be derived by comparing the numeric `gender` column with
#    the strings "female" / "male", which never match, so `sex` was all NA.
#    It is now a copy of `gender`.
#  * The prediction table has one row per candidate *row*, so joining it
#    back on (lname, fname, cand_gender) multiplied every repeated candidate
#    before distinct() collapsed the copies again. It is now de-duplicated
#    before the join; the final distinct() is kept so the result is unchanged.
#  * A Census API key was hard-coded in a commented-out call here; it has
#    been removed from the source.
candidates <- df %>%
  select(fname, lname, cand_gender, state) %>%
  mutate(surname = lname,
         gender = case_when(cand_gender == "F" ~ 1,
                            cand_gender == "M" ~ 0)) %>%
  na.omit() %>%
  mutate(sex = gender)

# NOTE(review): the key that used to appear in this file should be treated as
# leaked and rotated. If a Census API call is needed again, read the key from
# the environment, e.g. Sys.getenv("CENSUS_API_KEY").

# NOTE(review): with surname.only = TRUE the prediction presumably relies on
# the surname alone, so `sex` should not affect the scores -- confirm against
# the installed predict_race() version.
race_preds <- predict_race(candidates,
                           surname.only = TRUE,
                           surname.year = 2000,
                           sex = TRUE) %>%
  select(lname, fname, gender,
         pred.whi, pred.bla, pred.his, pred.asi, pred.oth) %>%
  # `gender` is carried into `df` as a character column, as before
  mutate(gender = as.character(gender),
         cand_gender = case_when(gender == "1" ~ "F",
                                 gender == "0" ~ "M")) %>%
  distinct()

df <- left_join(df, race_preds, by = c("lname", "fname", "cand_gender")) %>%
  distinct()

rm(candidates, race_preds)

#### Add algorithm ratings ####
# Attach the first-name and last-name rating tables, average the letter and
# phoneme ratings into one score per name, and flip the sign of every rating.
sen_first <- read_csv("data/algorithm-ratings/SenateNamesFirstRatings.csv")
sen_last <- read_csv("data/algorithm-ratings/SenateNamesLastRatings.csv")

df <- df %>%
  left_join(sen_first, by = "fname") %>%
  left_join(sen_last, by = "lname") %>%
  mutate(
    # Averages are taken from the un-negated ratings, then negated
    first_algorithm = -((first_letters + first_phonemes) / 2),
    last_algorithm = -((last_letters + last_phonemes) / 2),
    first_letters = -first_letters,
    first_phonemes = -first_phonemes,
    last_letters = -last_letters,
    last_phonemes = -last_phonemes
  )

#### Clean up model variables ####
# Recode the raw fields into model-ready variables in a single pass.
# Unrecognised or missing codes become NA for the case_when() recodes; the
# race indicators are 0 whenever the predicted probability is missing.
df <- df %>%
  rename(year = cycle) %>%
  mutate(
    # 1 = incumbent ("I"); 0 = "C" or "O"
    incumbent = case_when(Incum_Chall %in% c("C", "O") ~ 0,
                          Incum_Chall == "I" ~ 1),
    female = case_when(cand_gender == "F" ~ 1,
                       cand_gender == "M" ~ 0),
    # Chamber label derived from `seat`
    race = case_when(seat == "federal:senate" ~ "Senate",
                     seat == "federal:house" ~ "House"),
    # Primary / general outcome: 1 = "W", 0 = "L"
    pwin = case_when(pwinner == "L" ~ 0,
                     pwinner == "W" ~ 1),
    gwin = case_when(gwinner == "L" ~ 0,
                     gwinner == "W" ~ 1),
    # Indicator is 1 only when the predicted probability exceeds .5
    white = if_else(pred.whi > .5, 1, 0, missing = 0),
    black = if_else(pred.bla > .5, 1, 0, missing = 0),
    hisp = if_else(pred.his > .5, 1, 0, missing = 0),
    asian = if_else(pred.asi > .5, 1, 0, missing = 0),
    # 1 when none of the four indicators above is set
    race_unsure = if_else(white + black + hisp + asian == 0, 1, 0),
    # Vote shares: convert to numeric and multiply by 100
    gpct = as.numeric(gpct) * 100,
    ppct = as.numeric(ppct) * 100
  )


#### Save out data ####
# Write the cleaned table to disk, then remove every object from the
# workspace.
analysis_path <- "data/analysis/dime-clean-analysis.csv"
write_csv(df, analysis_path)
rm(list = ls())